HW 01

Author

Joey Garcia

0 - Setup

if (!require("pacman")) 
  install.packages("pacman")
Loading required package: pacman
# use this line for installing/loading
pacman::p_load(tidyverse, lubridate, glue, scales, dplyr, ggthemes, ggrepel, grid, gridExtra, openintro)

devtools::install_github("tidyverse/dsbox")
Using GitHub PAT from the git credential store.
Skipping install of 'dsbox' from a github remote, the SHA1 (244ecdfe) has not changed since last install.
  Use `force = TRUE` to force installation
# set theme for ggplot2
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 14))

# set width of code output
options(width = 65)

# set figure parameters for knitr
knitr::opts_chunk$set(
  fig.width = 7,        # 7" width
  fig.asp = 0.618,      # the golden ratio
  fig.retina = 3,       # dpi multiplier for displaying HTML output on retina
  fig.align = "center", # center align figures
  dpi = 300             # higher dpi, sharper image
)

1 - Road traffic accidents in Edinburgh

accidents <- read.csv("data/accidents.csv")

glimpse(accidents)
Rows: 768
Columns: 31
$ id                 <chr> "2018950000002", "2018950000006", "2…
$ easting            <int> 327174, 324874, 330500, 321890, 3201…
$ northing           <int> 670941, 672457, 671750, 671640, 6693…
$ longitude          <dbl> -3.167032, -3.204252, -3.114026, -3.…
$ latitude           <dbl> 55.92600, 55.93926, 55.93376, 55.931…
$ police_force       <int> 95, 95, 95, 95, 95, 95, 95, 95, 95, …
$ severity           <chr> "Slight", "Slight", "Slight", "Sligh…
$ vehicles           <int> 1, 1, 2, 3, 2, 3, 1, 1, 1, 2, 2, 1, …
$ casualties         <int> 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, …
$ date               <chr> "31/12/2018", "30/12/2018", "03/01/2…
$ day_of_week        <chr> "Monday", "Sunday", "Wednesday", "Mo…
$ time               <chr> "14:59:00", "12:50:00", "14:34:00", …
$ district           <int> 923, 923, 923, 923, 923, 923, 923, 9…
$ highway            <chr> "S12000036", "S12000036", "S12000036…
$ first_road_class   <chr> "Unclassified", "Unclassified", "A(M…
$ first_road_number  <int> 0, 0, 6095, 71, 0, 720, 0, 0, 1, 700…
$ road_type          <chr> "Single carriageway", "Single carria…
$ speed_limit        <int> 20, 20, 20, 30, 30, 70, 20, 30, 20, …
$ junction_detail    <chr> "Other junction", "Other junction", …
$ junction_control   <chr> "Give way or uncontrolled", "Give wa…
$ second_road_class  <chr> "Unclassified", "Missing / Out of ra…
$ second_road_number <int> 0, -1, 6106, 0, 0, 720, 0, 0, 0, 700…
$ ped_cross_human    <chr> "None within 50 metres", "None withi…
$ ped_cross_physical <chr> "Pedestrian phase at traffic signal …
$ light              <chr> "Daylight", "Daylight", "Daylight", …
$ weather            <chr> "Fine + no high winds", "Fine + no h…
$ road_surface       <chr> "Dry", "Dry", "Wet or damp", "Wet or…
$ special_condition  <chr> "None", "None", "None", "None", "Non…
$ hazard             <chr> "None", "None", "None", "None", "Non…
$ urban_rural        <int> 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, …
$ police             <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "…
unique(accidents$day_of_week)
[1] "Monday"    "Sunday"    "Wednesday" "Thursday"  "Friday"   
[6] "Tuesday"   "Saturday" 
unique(accidents$severity)
[1] "Slight"  "Fatal"   "Serious"
# Identify weekends and weekday & convert time into numerical values
accidents <- accidents |>
  mutate(
    # Identify Weekends
    in_weekend = case_when(
      day_of_week %in% c('Saturday', 'Sunday') ~ 1,
      TRUE ~ 0
    ),
    day_type = if_else(in_weekend == 1, "Weekend", "Weekday"), 
    
    # Convert time into a numerical value for density plot
    time_parsed = parse_date_time(time, orders = 'HMS'), # parse by Hour:Min:Sec
    time_decimal = hour(time_parsed) + minute(time_parsed) / 60 + second(time_parsed) / 3600
  )

# Density plot of accident severity throughout a day
accidents |>
  ggplot(aes(x=time_decimal, fill = severity)) +
  geom_density(alpha= 0.6) +
  facet_wrap(~day_type, ncol = 1) +
  scale_x_continuous(breaks = seq(0, 24, by = 2)) +
  scale_color_colorblind() +
  labs(
    x = "Time of day (Hours)",
    y = "Density",
    color = NULL, linetype = NULL,
    title = "Number of accidents throughout the day",
    subtitle = "By day of week and severity"
    ) +
  theme_minimal()

2 - NYC marathon winners

marathon <- read.csv('data/nyc_marathon.csv')
glimpse(marathon)
Rows: 102
Columns: 7
$ year     <int> 1970, 1970, 1971, 1971, 1972, 1972, 1973, 1973…
$ name     <chr> "Gary Muhrcke", NA, "Norman Higgins", "Beth Bo…
$ country  <chr> "United States", NA, "United States", "United …
$ time     <chr> "02:31:38", NA, "02:22:54", "02:55:22", "02:27…
$ time_hrs <dbl> 2.527222, NA, 2.381667, 2.922778, 2.464444, 3.…
$ division <chr> "Men", "Women", "Men", "Women", "Men", "Women"…
$ note     <chr> "Course record", "No woman finishers", "Course…
sum(is.na(marathon$time)) # counts NA rows
[1] 3
marathon <- marathon |>
  drop_na(time_hrs, )
glimpse(marathon)
Rows: 99
Columns: 7
$ year     <int> 1970, 1971, 1971, 1972, 1972, 1973, 1973, 1974…
$ name     <chr> "Gary Muhrcke", "Norman Higgins", "Beth Bonner…
$ country  <chr> "United States", "United States", "United Stat…
$ time     <chr> "02:31:38", "02:22:54", "02:55:22", "02:27:52"…
$ time_hrs <dbl> 2.527222, 2.381667, 2.922778, 2.464444, 3.1447…
$ division <chr> "Men", "Men", "Women", "Men", "Women", "Men", …
$ note     <chr> "Course record", "Course record", "World recor…
summary(marathon)
      year          name             country         
 Min.   :1970   Length:99          Length:99         
 1st Qu.:1982   Class :character   Class :character  
 Median :1995   Mode  :character   Mode  :character  
 Mean   :1995                                        
 3rd Qu.:2007                                        
 Max.   :2020                                        
     time              time_hrs       division        
 Length:99          Min.   :2.085   Length:99         
 Class :character   1st Qu.:2.166   Class :character  
 Mode  :character   Median :2.386   Mode  :character  
                    Mean   :2.350                     
                    3rd Qu.:2.456                     
                    Max.   :3.145                     
     note          
 Length:99         
 Class :character  
 Mode  :character  
                   
                   
                   

(A)

In the histogram, it’s apparent that there are 2 major inflection points in the dataset, this difference is what separates the times between the Men and Women divisions. In the boxplot, we are able to see the median value and outliers of the dataset.

marathon |>
  ggplot(aes(x=time_hrs)) +
  geom_histogram(binwidth = 0.1, fill='cornsilk2') +
  labs(
    x= 'Finish Time (Hours)', 
    y= 'Count',
    title = 'Marathon Finish Times', 
    subtitle = 'Histogram of all divisions'
  ) +
  theme_minimal()

marathon |>
  ggplot(aes(y=time_hrs)) +
  geom_boxplot(alpha=0.6, fill='skyblue') +
  labs(
    y= 'Finish Time (Hours)',
    title = 'Marathon Finish Times', 
    subtitle = 'Boxplot of all divisions'
  ) +
  theme_minimal()

(B)

When comparing the Men and Women divisions, we can see that the medians of the boxplots are more distinct. There is less variation in time in each boxplot. The Men division has a faster average marathon time than the woman division.

marathon |>
  ggplot(aes(y=time_hrs, fill = division)) +
  geom_boxplot(alpha=0.6) +
  facet_wrap(~ division) +
  scale_fill_manual(values = c("Men" = "skyblue", "Women" = "cornsilk4")) +
  labs(
    y= 'Finish Time (Hours)',
    title = 'Marathon Finish Times', 
    subtitle = 'Boxplots of Men and Women division', 
    fill = 'Division'
  ) +
  theme_minimal()

(C)

In themes, we are able to remove the legend and the x axis markers. The legend is removed because the boxplot title is more than sufficient for clearly representing the boxplot. The x-axis doesn’t have graphical significance in portraying our message of the boxplot.

marathon |>
  ggplot(aes(y=time_hrs, fill = division)) +
  geom_boxplot(alpha=0.6) +
  facet_wrap(~ division) +
  scale_fill_manual(values = c("Men" = "skyblue", "Women" = "cornsilk4")) +
  labs(
    y= 'Finish Time (Hours)',
    title = 'Marathon Finish Times', 
    subtitle = 'Boxplots of Men and Women division', 
    fill = 'Division'
  ) +
  theme_minimal() + 
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

(D)

In this line plot, we are able to see the disparity in the two divisions. We are able to visually see a negative trend,indicating that over time, the two divisions are finishing the marathon faster.

marathon |>
  ggplot(aes(x= year, y= time_hrs, colour = division)) +
  geom_line() +
  geom_point(aes(shape= division), size= 2, alpha= 0.6) +
  scale_color_manual(values = c("Men" = "skyblue", "Women" = "cornsilk4")) +
  labs(
    x= 'Year',
    y= 'Finish Time (Hours)',
    title = 'Marathon Finish Times', 
    subtitle = 'Time-series of Men and Women division'
  ) +
  theme_minimal() 

3 - US counties

data("county")

county <- county |>
  drop_na()

glimpse(county)
Rows: 2,560
Columns: 15
$ name              <chr> "Autauga County", "Baldwin County", "…
$ state             <fct> Alabama, Alabama, Alabama, Alabama, A…
$ pop2000           <dbl> 43671, 140415, 29038, 20826, 51024, 1…
$ pop2010           <dbl> 54571, 182265, 27457, 22915, 57322, 1…
$ pop2017           <int> 55504, 212628, 25270, 22668, 58013, 1…
$ pop_change        <dbl> 1.48, 9.19, -6.22, 0.73, 0.68, -2.28,…
$ poverty           <dbl> 13.7, 11.8, 27.2, 15.2, 15.6, 28.5, 1…
$ homeownership     <dbl> 77.5, 76.7, 68.0, 82.9, 82.0, 76.9, 7…
$ multi_unit        <dbl> 7.2, 22.6, 11.1, 6.6, 3.7, 9.9, 8.7, …
$ unemployment_rate <dbl> 3.86, 3.99, 5.90, 4.39, 4.02, 4.93, 4…
$ metro             <fct> yes, yes, no, yes, yes, no, no, no, y…
$ median_edu        <fct> some_college, some_college, hs_diplom…
$ per_capita_income <dbl> 27841.70, 27779.85, 17891.73, 20572.0…
$ median_hh_income  <int> 55317, 52562, 33368, 43404, 47412, 29…
$ smoking_ban       <fct> none, none, partial, none, none, none…
summary(county)
     name                state         pop2000       
 Length:2560        Texas   : 235   Min.   :    444  
 Class :character   Georgia : 143   1st Qu.:  10638  
 Mode  :character   Virginia: 122   Median :  23590  
                    Kentucky: 106   Mean   :  84569  
                    Missouri: 100   3rd Qu.:  56909  
                    Kansas  :  95   Max.   :9519338  
                    (Other) :1759                    
    pop2010           pop2017           pop_change      
 Min.   :    460   Min.   :     457   Min.   :-33.6300  
 1st Qu.:  10803   1st Qu.:   10653   1st Qu.: -1.9400  
 Median :  24767   Median :   24968   Median : -0.0100  
 Mean   :  93788   Mean   :   99575   Mean   :  0.5836  
 3rd Qu.:  63126   3rd Qu.:   64113   3rd Qu.:  2.4925  
 Max.   :9818605   Max.   :10163507   Max.   : 37.1900  
                                                        
    poverty      homeownership     multi_unit   
 Min.   : 2.40   Min.   :22.80   Min.   : 0.00  
 1st Qu.:11.60   1st Qu.:69.20   1st Qu.: 5.90  
 Median :15.60   Median :74.40   Median : 9.25  
 Mean   :16.28   Mean   :73.25   Mean   :11.90  
 3rd Qu.:19.80   3rd Qu.:78.40   3rd Qu.:15.40  
 Max.   :48.20   Max.   :91.30   Max.   :98.50  
                                                
 unemployment_rate metro             median_edu  
 Min.   : 1.620    no :1615   below_hs    :   1  
 1st Qu.: 3.507    yes: 945   hs_diploma  :1187  
 Median : 4.330               some_college:1335  
 Mean   : 4.600               bachelors   :  37  
 3rd Qu.: 5.312                                  
 Max.   :19.070                                  
                                                 
 per_capita_income median_hh_income   smoking_ban  
 Min.   :10467     Min.   : 19264   none    :1925  
 1st Qu.:21494     1st Qu.: 40598   partial : 635  
 Median :24926     Median : 47132   complete:   0  
 Mean   :25758     Mean   : 49047                  
 3rd Qu.:29036     3rd Qu.: 55109                  
 Max.   :69533     Max.   :129588                  
                                                   

(A)

The below code works, but it does a poor job of explaining how the response “median_hh_income” relates to the variable “median_edu”. The code adds the box plot and extends the x-axis variables, thus not raising conflicts between the two plots. The y-axis is significantly stretched because the maximum values of pop2017 (max = 10163507) are greater than median_hh_income (max = 129588). All in all, the code works, but the graph does not make sense.

ggplot(county) +
  geom_point(aes(x = median_edu, y = median_hh_income)) +
  geom_boxplot(aes(x = smoking_ban, y = pop2017))

(B)

Of the two figures below, the 2nd figure best compares poverty levels to different median education levels. In the 2nd figure, we are able to distinguish the higher poverty levels per education level. Our choice is critical when picking how we want to facet our graphs. When faceting, this will stretch our graphs in directions that may obscure our data, this will inherently change how our graph’s message is perceived.

ggplot(county %>% filter(!is.na(median_edu))) + 
  geom_point(aes(x = homeownership, y = poverty)) + 
  facet_grid(median_edu ~ .)

ggplot(county %>% filter(!is.na(median_edu))) + 
  geom_point(aes(x = homeownership, y = poverty)) + 
  facet_grid(. ~ median_edu)

(C)

county |>
  ggplot(aes(x= homeownership, y= poverty)) +
  geom_point() +
  labs(
    title= 'Plot A'
  ) + 
  theme_minimal()

county |>
  ggplot(aes(x= homeownership, y= poverty)) +
  geom_point() +
  geom_smooth(se= FALSE) +
  labs(
    title= 'Plot B'
  )
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs
= "cs")'

county |>
  ggplot(aes(x= homeownership, y= poverty)) +
  geom_point() +
  geom_smooth(aes(colour=metro), se= FALSE) +
  scale_color_manual(values = c("no" = "green", "yes" = "green")) +
  labs(
    title= 'Plot C'
  ) +
  theme_minimal() +
  theme(legend.position = "none")
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs
= "cs")'

county |>
  ggplot(aes(x= homeownership, y= poverty)) +
  geom_smooth(aes(colour=metro), se= FALSE) +
  geom_point() +
  scale_color_manual(values = c("no" = "blue", "yes" = "blue")) +
  labs(
    title= 'Plot D'
  ) +
  theme_minimal() +
  theme(legend.position = "none")
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs
= "cs")'

county |>
  ggplot(aes(x = homeownership, y = poverty)) +
  geom_point(aes(colour = metro)) +
  geom_smooth(aes(linetype = metro), colour = "blue", se = FALSE) +
  scale_linetype_manual(name = "metro ", values = c("no" = "solid", "yes" = "dashed")) +
  labs(
    title = 'Plot E'
  ) +
  theme_minimal()+ 
  guides(
    linetype = guide_legend(order = 1)
  )
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs
= "cs")'

county |>
  ggplot(aes(x= homeownership, y= poverty, colour=metro)) +
  geom_point() +
  geom_smooth(se= FALSE) +
  labs(
    title= 'Plot F'
  ) +
  theme_minimal()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs
= "cs")'

county |>
  ggplot(aes(x= homeownership, y= poverty)) +
  geom_point(aes(colour= metro)) +
  geom_smooth(se= FALSE) +
  labs(
    title= 'Plot F'
  ) +
  theme_minimal()
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs
= "cs")'

county |>
  ggplot(aes(x= homeownership, y= poverty)) +
  geom_point(aes(colour= metro)) +
  labs(
    title= 'Plot H'
  ) +
  theme_minimal()

4 - Rental apartments in SF

credit <- read.csv('data/credit.csv')
glimpse(credit)
Rows: 400
Columns: 5
$ balance <int> 333, 903, 580, 964, 331, 1151, 203, 872, 279, 1…
$ income  <dbl> 14.891, 106.025, 104.593, 148.924, 55.882, 80.1…
$ student <chr> "No", "Yes", "No", "No", "No", "No", "No", "No"…
$ married <chr> "Yes", "Yes", "No", "No", "Yes", "No", "No", "N…
$ limit   <int> 3606, 6645, 7075, 9504, 4897, 8047, 3388, 7114,…
summary(credit)
    balance            income         student         
 Min.   :   0.00   Min.   : 10.35   Length:400        
 1st Qu.:  68.75   1st Qu.: 21.01   Class :character  
 Median : 459.50   Median : 33.12   Mode  :character  
 Mean   : 520.01   Mean   : 45.22                     
 3rd Qu.: 863.00   3rd Qu.: 57.47                     
 Max.   :1999.00   Max.   :186.63                     
   married              limit      
 Length:400         Min.   :  855  
 Class :character   1st Qu.: 3088  
 Mode  :character   Median : 4622  
                    Mean   : 4736  
                    3rd Qu.: 5873  
                    Max.   :13913  

(A)

In all instances, if the person is married or a student, there is a positive trend across all plots. The positive trends indicate the higher the income then the higher the credit card balance.

credit |>
  ggplot(aes(x= income, y= balance, colour = student)) +
  geom_point(aes(shape = student), alpha= 0.6) +
  geom_smooth(method= 'lm', se= FALSE) +
  scale_color_manual(values = c("No" = "skyblue", "Yes" = "red4")) +
  
  scale_x_continuous(labels = label_dollar(suffix = 'K')) +
  scale_y_continuous(labels = label_dollar()) +
  
  facet_grid(
    student ~ married, 
    labeller = labeller(
      student = c('Yes' = 'Student: Yes', 'No' = 'Student: No'),
      married = c('Yes' = 'Married:yes', 'No' = 'Married:No')
    )
  ) +
  
  labs(
    x= 'Income',
    y= 'Credit Card Balance'
  ) +
  
  theme_minimal() +
  theme(
    legend.position = 'none', 
    panel.border = element_rect(colour = 'black', fill= NA, linewidth = 1), 
    strip.background = element_rect(colour = 'black', fill = 'gray', linewidth = 1)
  )
`geom_smooth()` using formula = 'y ~ x'

(B)

Based on the plot, the predictors ‘income’, ‘marriage’, and ‘student’ are not strong predictors for figuring out the credit ‘balance’. The trends are very similar, the y-intercepts are unique in their subplots, but they generally have the same linear regression line.

Based on the statistical summary below, we can see that the ‘student’ predictor shows statistical significance with a very small p value less the 0.05. The ‘married’ predictor has a p-value of 0.948 which indicates that it’s statistically less significant. ‘student’ is a useful predictor.

model_income <- lm(balance ~ income, data = credit)
summary(model_income)

Call:
lm(formula = balance ~ income, data = credit)

Residuals:
    Min      1Q  Median      3Q     Max 
-803.64 -348.99  -54.42  331.75 1100.25 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 246.5148    33.1993   7.425  6.9e-13 ***
income        6.0484     0.5794  10.440  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 407.9 on 398 degrees of freedom
Multiple R-squared:  0.215, Adjusted R-squared:  0.213 
F-statistic:   109 on 1 and 398 DF,  p-value: < 2.2e-16
model_full <- lm(balance ~ income + student + married, data = credit)
summary(model_full)

Call:
lm(formula = balance ~ income + student + married, data = credit)

Residuals:
   Min     1Q Median     3Q    Max 
-764.1 -330.5  -45.0  324.6  819.2 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 212.7342    40.5903   5.241 2.60e-07 ***
income        5.9857     0.5577  10.733  < 2e-16 ***
studentYes  382.3369    65.5914   5.829 1.16e-08 ***
marriedYes   -2.6439    40.4084  -0.065    0.948    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 392.3 on 396 degrees of freedom
Multiple R-squared:  0.2775,    Adjusted R-squared:  0.272 
F-statistic: 50.69 on 3 and 396 DF,  p-value: < 2.2e-16
anova(model_income, model_full)
Analysis of Variance Table

Model 1: balance ~ income
Model 2: balance ~ income + student + married
  Res.Df      RSS Df Sum of Sq      F    Pr(>F)    
1    398 66208745                                  
2    396 60938395  2   5270350 17.124 7.365e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(C)

# Data Wrangling
credit <- credit |> 
  mutate(
    credit_utilization = balance / limit
  )

# Credit Utilization vs. Income (incorporating student & marriage predictors)
credit |>
  ggplot(aes(x= income, y= credit_utilization, colour = student)) +
  geom_point(aes(shape = student), alpha= 0.6) +
  geom_smooth(method= 'lm', se= FALSE) +
  scale_color_manual(values = c("No" = "skyblue", "Yes" = "red4")) +
  
  scale_x_continuous(labels = label_dollar(suffix = 'K')) +
  scale_y_continuous(labels = percent) +
  
  facet_grid(
    student ~ married, 
    labeller = labeller(
      student = c('Yes' = 'Student: Yes', 'No' = 'Student: No'),
      married = c('Yes' = 'Married:yes', 'No' = 'Married:No')
    )
  ) +
  
  labs(
    x= 'Income',
    y= 'Credit Utilization'
  ) +
  
  theme_minimal() +
  theme(
    legend.position = 'none', 
    panel.border = element_rect(colour = 'black', fill= NA, linewidth = 1), 
    strip.background = element_rect(colour = 'black', fill = 'gray', linewidth = 1)
  )
`geom_smooth()` using formula = 'y ~ x'

(D)

Based on the plot from part (c), we see higher y-intercepts for credit card utilization of students(‘yes’). Another noticeable change is married and non-married students show a negative trend. This indicates that credit card utilization decreases as income increases for students.

5 - Napoleon’s march.

napoleon <- read_rds('data/napoleon.rds')
glimpse(napoleon)
List of 3
 $ cities      : spc_tbl_ [20 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
  ..$ long: num [1:20] 24 25.3 26.4 26.8 27.7 27.6 28.5 28.7 29.2 30.2 ...
  ..$ lat : num [1:20] 55 54.7 54.4 54.3 55.2 53.9 54.3 55.5 54.4 55.3 ...
  ..$ city: chr [1:20] "Kowno" "Wilna" "Smorgoni" "Moiodexno" ...
  ..- attr(*, "spec")=
  .. .. cols(
  .. ..   long = col_double(),
  .. ..   lat = col_double(),
  .. ..   city = col_character()
  .. .. )
  ..- attr(*, "problems")=<externalptr> 
 $ temperatures: spc_tbl_ [9 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
  ..$ long : num [1:9] 37.6 36 33.2 32 29.2 28.5 27.2 26.7 25.3
  ..$ temp : num [1:9] 0 0 -9 -21 -11 -20 -24 -30 -26
  ..$ month: chr [1:9] "Oct" "Oct" "Nov" "Nov" ...
  ..$ day  : num [1:9] 18 24 9 14 24 28 1 6 7
  ..$ date : Date[1:9], format: "1812-10-18" ...
  ..- attr(*, "spec")=
  .. .. cols(
  .. ..   long = col_double(),
  .. ..   temp = col_double(),
  .. ..   month = col_character(),
  .. ..   day = col_double(),
  .. ..   date = col_character()
  .. .. )
  ..- attr(*, "problems")=<externalptr> 
 $ troops      : spc_tbl_ [51 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
  ..$ long     : num [1:51] 24 24.5 25.5 26 27 28 28.5 29 30 30.3 ...
  ..$ lat      : num [1:51] 54.9 55 54.5 54.7 54.8 54.9 55 55.1 55.2 55.3 ...
  ..$ survivors: num [1:51] 340000 340000 340000 320000 300000 280000 240000 210000 180000 175000 ...
  ..$ direction: chr [1:51] "advancing" "advancing" "advancing" "advancing" ...
  ..$ group    : num [1:51] 1 1 1 1 1 1 1 1 1 1 ...
  ..- attr(*, "spec")=
  .. .. cols(
  .. ..   long = col_double(),
  .. ..   lat = col_double(),
  .. ..   survivors = col_double(),
  .. ..   direction = col_character(),
  .. ..   group = col_double()
  .. .. )
  ..- attr(*, "problems")=<externalptr> 
breaks <- c(100000, 200000, 300000) # Set scale breaks for 'survivor' var

# Plot Troop movements
plot_troops <- napoleon$troops |>
  ggplot(aes(x= long, y= lat)) +
  # Plot Path of troops
  geom_path(
    aes(size = survivors, colour = direction, group = group),
    lineend="round"
  ) +
  # Scale plot
  scale_size(
    name= "Survivors",
    range = c(0.5,5), #scales 'survivors' path; adjust for legend readability
    breaks=breaks,
    labels=comma(breaks) # from scales library
  ) +
  # Add color to Troops by 'direction'
  scale_color_manual(
    name= "Direction", 
    values = c("goldenrod", "black"),
    labels=c("Advance", "Retreat")
  ) +
  theme_minimal()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2
3.4.0.
ℹ Please use `linewidth` instead.
plot_troops

# Plot City names and points over Troops path
plot_troops_cities <- plot_troops +
  # Plot city names and points
  geom_point(
    data = napoleon$cities,
    colour = 'orangered3'
  ) +
  geom_text_repel(
    data = napoleon$cities, 
    aes(label = city),
    colour = 'orangered3',
    family = "serif"
  ) +
  
  # set X axis to match Temperature plot
  coord_cartesian(xlim = c(24, 38)) +
  labs(
    x = NULL,
    y = NULL,
    title = "Napoleon's March",
    subtitle = "1812-13 invasion of Russia, Public Domain"
  ) +
  # disable Legend
  guides(color = FALSE, size = FALSE) +
  theme_minimal() +
  # disable grids
  theme(
    text = element_text(family = "serif"),
    axis.text = element_blank(),
    panel.grid = element_blank()
  )
Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use
"none" instead as of ggplot2 3.3.4.
plot_troops_cities

# Plot Temperature path plot
plot_temp <- napoleon$temperatures |>
  # create labels for points
  mutate(
    label = paste0(temp, "°, ", month, ".", day)
  ) |>
  # Temperature path based on Longitude
  ggplot(aes(long, temp)) +
  geom_path(color= "grey", linewidth= 1.5) +
  geom_point(size= 1) +
  geom_text_repel(aes(label= label), size = 2.5) +
  
  # set X axis similar to troop_city plot
  coord_cartesian(xlim = c(24, 37)) +
  scale_y_continuous(position = "right") +
  labs(
    x = NULL, 
    y = "° Celsius",
    caption = "Source: Charles Minard's 1869 chart"
  ) +
  theme_bw() +
  theme(
    text = element_text(family = "serif"),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    axis.text.x = element_blank(), 
    axis.ticks = element_blank(),
    panel.border = element_blank()
  )
plot_temp

# Combine the plots (heights are greater for troop_city)
grid.arrange(plot_troops_cities, plot_temp, nrow = 2, heights = c(3.5, 1.2))
# Add Border on Graphic
grid.rect(width = .99, height = .99, gp = gpar(lwd = 2, col = "gray", fill = NA))

Response

The project is truly inspired by the plot from shakir-flash[1]. For my recreation of Charles Minard’s 1869 chart, I separated each layer(or “plot”) into three chunks. Each chunk is a layer that is later consolidated in the last chunk labeled “Consolidate plots”. The first chunk creates the routes of the troop movements. The second chunk identifies labels and points for the cities, and this is overlaid with the first chunk “troop plot”. The third chunk creates the plot that reflects the temperature (y-value) based on the longitude (x-value). The last chunk, “Consolidate plots”, stacks the two graphs and showcases the final project. Styles for labeling and plotting were inspired by andrewheiss [2].

Citation List

  1. https://github.com/shakir-flash/Napoleons-March-Minard-Visualization

This version of Napoleons March is 2 years old. The majority of my codes framework was received from this repository. Rather than copy/pasting the code, I used my own coding style to breakdown the plots into more intuitive sections.

  1. https://github.com/andrewheiss/fancy-minard

This versions of Napoleons March inspired me to change the formatting of the Temperature plot.